library(plotly)
library(readxl)
library(tidyverse)
library(mapdata)
library(gridExtra)
library(pheatmap)
options(scipen = 999)
EUSuperstore <- read_excel("EUSuperstore.xls")
The summary of numeric variables in this data set is:
graph_profit <- ggplot(EUSuperstore, aes(y = Profit)) +
geom_boxplot()
graph_sales <- ggplot(EUSuperstore, aes(y = Sales)) +
geom_boxplot()
graph_discount <- ggplot(EUSuperstore, aes(y = Discount)) +
geom_boxplot()
graph_quantity <- ggplot(EUSuperstore, aes(y = Quantity)) +
geom_boxplot()
grid.arrange(graph_sales, graph_profit,graph_discount,graph_quantity, ncol = 4)
summary(EUSuperstore$Sales)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.955 49.462 119.355 293.809 320.709 7958.580
summary(EUSuperstore$Quantity)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 3.777 5.000 14.000
summary(EUSuperstore$Discount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1031 0.1000 0.8500
summary(EUSuperstore$Profit)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3059.82 1.32 14.22 37.28 48.51 3979.08
Are there any Null values?
EUSuperstore |>
janitor::clean_names()
## # A tibble: 10,000 x 20
## row_id order_id order_date ship_date ship_mode customer_id
## <dbl> <chr> <dttm> <dttm> <chr> <chr>
## 1 1 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045
## 2 2 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045
## 3 3 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045
## 4 4 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045
## 5 5 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045
## 6 6 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045
## 7 7 ES-2018~ 2018-02-07 00:00:00 2018-02-11 00:00:00 Standard~ AS-10045
## 8 8 ES-2016~ 2016-08-02 00:00:00 2016-08-07 00:00:00 Second C~ EB-13840
## 9 9 ES-2016~ 2016-08-02 00:00:00 2016-08-07 00:00:00 Second C~ EB-13840
## 10 10 ES-2016~ 2016-08-02 00:00:00 2016-08-07 00:00:00 Second C~ EB-13840
## # ... with 9,990 more rows, and 14 more variables: customer_name <chr>,
## # segment <chr>, city <chr>, state <chr>, country <chr>, region <chr>,
## # product_id <chr>, category <chr>, sub_category <chr>, product_name <chr>,
## # sales <dbl>, quantity <dbl>, discount <dbl>, profit <dbl>
is.null(EUSuperstore)
## [1] FALSE
View(EUSuperstore)
Let’s analyze patterns in our Dataset
ggplotly(
ggplot(EUSuperstore, aes(x = Category, fill = Category)) +
geom_bar() +
labs(x = "Category", y = "Frequency",
title = "Frequency of Category"))
Analysis: This bar plot shows that the superstore has more Office Supplies than Technology or Furniture.
ggplotly(
ggplot(data = EUSuperstore, aes(x = `Sub-Category`, fill = `Sub-Category`)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 90)) +
labs(x = "SUb Category", y = "Frequency", title = "Frequency of SUb-Category")
)
Analysis: This graph demonstrates that the superstore offers a large range of Art, Binder, and Storage items in Sub-Category.
ggplot(data = EUSuperstore, aes(x = `Ship Mode`, y = Sales, fill = `Ship Mode`)) +
geom_bar(stat = "identity")
ggplot(data = EUSuperstore, aes(x = Sales, y = Profit, color =`Ship Mode`)) + geom_point()
Analysis: And hence, obviously we see more profits/loss have been availed from the standard shipment class.But, there are not higher range profits seen this feature.
Let us see how Sales are affected if discounts are offered.
ggplot() + geom_point(data = EUSuperstore, aes(x = Discount, y = Sales, color = `Ship Mode`))
Let’s see whether profits have been triggered if discounts have been redeemed.
ggplot() + geom_bar(data = EUSuperstore, aes(x = Discount, y = Profit, fill = `Ship Mode`), stat = "identity")
ggplot() + geom_bar(data = EUSuperstore, aes(x = `Sub-Category`, y = Profit, fill = Region), stat = "identity") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
EUSuperstore |>
group_by(Region, `Sub-Category`) |>
summarise(total_sales = sum(Sales)) -> plot
plot |>
pivot_longer(Region,
names_to = "key",
values_to = "value") -> up_plot
ggplot(data = up_plot, aes(x = `Sub-Category`, y = total_sales, fill = value)) +
geom_col(position = "dodge") +
theme(axis.text.x = element_text(angle = 45))
ggplot() + geom_bar(data = EUSuperstore, aes(x = Category, y = Sales, fill = Region), stat = "identity") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
ggplot() + geom_bar(data = EUSuperstore, aes(x = Category, y = Profit, fill = Region), stat = "identity") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
ggplot() + geom_col(data = EUSuperstore, aes(x = Sales, y = Profit, color = Category))
EUSuperstore |>
group_by(Region) |>
summarise(total_sales = sum(Sales)) -> pplot
pie(pplot$total_sales, labels = pplot$Region)
cost_eu_superstore <- EUSuperstore |>
mutate(cost = Sales - Profit) |>
group_by(`Sub-Category`) |>
summarise(total_cost = sum(cost))
ggplotly(
ggplot(cost_eu_superstore, aes(x = `Sub-Category`, y = total_cost, fill = `Sub-Category`)) +
geom_bar(stat = 'identity') +
theme(axis.text.x = element_text(angle = 45)) +
labs(x = "Sub Category", y = "Total Cost", title = "Total Costs in Each Sub Category")
)
Analysis: Copier, phone, storage, and bookshelf costs are higher for retailers.
ggplotly(
ggplot(EUSuperstore, aes(x = Segment, fill = Segment)) +
geom_bar() +
labs(x = "Segment", y = "Frequency", title = "Popularity of Segment")
)
Analysis: The consumer segment is more appealing.
prof_segment_subcat <- EUSuperstore |>
select(Segment, `Sub-Category`, Profit) |>
group_by(Segment, `Sub-Category`) |>
summarise(total_prof = sum(Profit)) |>
pivot_longer(Segment,
values_to = "segment")
ggplotly(
ggplot(prof_segment_subcat, aes(x = `Sub-Category`, fill = segment, y = total_prof)) +
geom_bar(stat = "identity", position = "dodge") +
theme(axis.text.x = element_text(angle = 45)) +
labs(x = "Sub Category", y = "Total Profit", title = "Total Profit of Sub Category in Each Segment")
)
Analysis: Because the profit rate of consumers in each sub-category is higher in the graph above, we may devote more attention to this sector than to Corporate and Home Office.
EUSuperstore |>
mutate(given_discount = Discount > 0) |>
mutate(discount_amount = ifelse(given_discount,
(Sales/(1 - Discount)) - Sales,
0)) |>
group_by(`Order ID`) |>
summarise(invoice_value = sum(Sales),
discount_value = sum(discount_amount)) |>
mutate(discount_given = discount_value > 0) -> transaction_discount
ggplotly(
ggplot(data = transaction_discount, aes(y = invoice_value, x = discount_given, color = discount_given)) +
geom_boxplot() +
labs(x = "Is Discount Given", y = "Invoice Value", title = "Effect of Invoice Values After Giving Discount")
)
Analysis: The discount applied to a specific item on each invoice is advantageous to us since it raises the invoice value. It’s also conceivable that the number of discounts granted in each transaction is significantly higher than the number of discounts not given. As a result, the transaction values of the discount granted will appear to grow, resulting in bias. As a result, we must examine the fraction of transactions with and without discounts.
ggplotly(
ggplot(data = transaction_discount, aes(x = discount_given, fill = discount_given)) +
geom_bar() +
labs(y = "Frequency", x = "Is Discount Given", title = "Count of Transaction in Which Discount Given")
)
Analysis: So our earlier boxplot analysis was right! The discount applied to a specific item on each invoice is advantageous to us since it raises the invoice value.
ggplotly(
ggplot(EUSuperstore, aes(x = Sales, y = Profit)) +
geom_hex() +
geom_smooth() +
labs(title = "Relationship Between Sales and Profit")
)
Analysis: The Profit increases as the Sales increase.
EUSuperstore |>
group_by(Country) |>
dplyr::summarise(total_sales = sum(Sales)) |>
arrange(desc(total_sales)) -> countrywise_sales
data.table::data.table(countrywise_sales)
## Country total_sales
## 1: France 858931.083
## 2: Germany 628840.030
## 3: United Kingdom 528576.300
## 4: Italy 289709.658
## 5: Spain 287146.680
## 6: Austria 81162.000
## 7: Netherlands 77514.945
## 8: Belgium 49226.700
## 9: Sweden 30491.403
## 10: Switzerland 24877.860
## 11: Finland 20704.350
## 12: Norway 20525.370
## 13: Ireland 16639.509
## 14: Portugal 15105.120
## 15: Denmark 8638.053
worldmap <- map_data("world")
colnames(countrywise_sales) = c("region", "values")
country_map <- countrywise_sales |>
left_join(worldmap, countrywise_sales, by = "region")
country_map <- country_map |>
filter(!is.na(country_map$values))
centroid <- aggregate(cbind(long, lat) ~ region, data = country_map, FUN = mean)
ggplotly(
ggplot(data = country_map, aes(x = long, y = lat)) +
geom_polygon(aes(fill = values), alpha = 0.7) +
geom_text(data = centroid, aes(x = long, y = lat, label = region)) +
coord_equal() +
labs(title = "Geographical Distribution of Sales Value")
)
Analysis: Because the value of sales in France, the United Kingdom, and Germany is higher, we should extend our product offerings in these nations.
Same day shipment if receives more discounts can trigger sales/profits. Discounts should be based on the Sales and should not increase a particular range otherwise unnecessary discounts with low sales can witness huge losses. Binders and Machines industry should be focused upon more so as to strengthen these weakened industry areas.Office Supplies and the Furniture industries do not seem to boom in the Central Region.